## 2016年暑期课程设计
## 问题:Grupo Bimbo Inventory Demand
## 宾堡集团的库存需求
## 最大限度地提高销售和最大限度地减少烘焙食品的退回
## Daitu
## start:2016.06.21
## 参考借鉴kaggle上的公开程序
setwd("/Users/Daitu/数据分析/kaggle/Grupo Bimbo")
getwd()
## [1] "/Users/daitu/数据分析/kaggle/Grupo Bimbo"
## 加载包
library(data.table)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(treemap)
## Warning: package 'treemap' was built under R version 3.2.4
## 读取数据####
## 1:读取训练集
system.time({
traindata <- fread("train.csv",sep=",",header = TRUE)
})
##
Read 0.0% of 74180464 rows
Read 4.4% of 74180464 rows
Read 8.8% of 74180464 rows
Read 13.3% of 74180464 rows
Read 17.6% of 74180464 rows
Read 22.1% of 74180464 rows
Read 26.5% of 74180464 rows
Read 30.7% of 74180464 rows
Read 34.5% of 74180464 rows
Read 38.2% of 74180464 rows
Read 42.3% of 74180464 rows
Read 46.1% of 74180464 rows
Read 50.2% of 74180464 rows
Read 54.6% of 74180464 rows
Read 58.7% of 74180464 rows
Read 63.0% of 74180464 rows
Read 67.3% of 74180464 rows
Read 71.6% of 74180464 rows
Read 76.0% of 74180464 rows
Read 80.3% of 74180464 rows
Read 84.5% of 74180464 rows
Read 89.0% of 74180464 rows
Read 93.1% of 74180464 rows
Read 97.5% of 74180464 rows
Read 74180464 rows and 11 (of 11) columns from 2.980 GB file in 00:00:30
## user system elapsed
## 27.100 2.231 30.947
# Read 74180464 rows and 11 (of 11) columns from 2.980 GB file in 00:00:29
# 用户 系统 流逝
# 26.553 2.077 28.869
## 2:读取客户名单数据
cliente_tabla <- fread("cliente_tabla.csv",sep=",",header = TRUE)
## 3:读取产品名单数据
producto_tabla <- fread("producto_tabla.csv",sep=",",header = TRUE)
## 4:读取城镇和国家(州)数据
town_state <- fread("town_state.csv",sep=",",header = TRUE)
## 数据的描述统计####
## 1:分析数据的周数:Semana
Semana <- data.frame(table(traindata$Semana))
colnames(Semana) <- c("Semana","Freq")
# 3 4 5 6 7 8 9
# 11165207 11009593 10615397 10191837 10382849 10406868 10408713
## 条形图
ggplot(data = Semana,aes(Semana,Freq)) +
geom_bar(stat = "identity", width = 0.6,fill = "lightblue",colour = "black") +
theme_grey(base_family = "STKaiti") +
scale_y_continuous() +
labs(x="周数",y="数据条数",title = "每周数据的记录数")

## 销售站的数据分析#####
## 2 :销售站Agencia 和 州State
agencias <- traindata %>%
group_by(Agencia_ID) %>% # 数据按照销售战进行分组统计
summarise(Units = sum(Venta_uni_hoy), # 总结多个值为一个值,units:本销售站的销量和
Pesos = sum(Venta_hoy), # 本周的销售量(比索)之和
Return_Units = sum(Dev_uni_proxima), # 下星期的返回量之和
Return_Pesos = sum(Dev_proxima), # 下星期的返回量(比索)之和
Net = sum(Demanda_uni_equil)) %>% # 调整后的需求和
mutate(Net_Pesos = Pesos - Return_Pesos, # mutate:添加新的变量
Return_Rate = Return_Units / (Units+Return_Units)) %>% # 添加变量退货比率
arrange(desc(Units)) %>% # 将数据按照变量Units的降序排列
inner_join(town_state, by="Agencia_ID") # 按照变量Agencia_ID,连接两个表,return all rows from x
## 可视化x:每天销量,y:销售站的数量
ggplot(agencias, aes(x=Units/7))+
geom_histogram(fill="red", color="gray", binwidth=10000)+ #条形图的宽度为10000
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x)paste(x/1000, "k"))+
labs(x = "每天的平均销量",y = "销售站数量",title = "销售站的销量")

## 前100的销售站的销量树图
treemap(agencias[1:100, ],
index=c("Agencia_ID"), vSize="Units", vColor="Return_Rate",
type="value", title.legend="Units return %", title="Top 100 agencias")

## 销售站的历史数据分析
agencias_history <- traindata %>%
group_by(Agencia_ID, Semana) %>% # 数据按照销售站、星期数,进行分组统计
summarise(Units = sum(Venta_uni_hoy), # 总结多个值为一个值,units:销量和
Pesos = sum(Venta_hoy),
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima),
Net = sum(Demanda_uni_equil)) %>%
mutate(Net_Pesos = Pesos - Return_Pesos,
Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(Agencia_ID, Semana) %>% # 将数据按照变量 销售站、星期数的降序排列
inner_join(town_state, by="Agencia_ID")
## 取出销售量前30的销售站ID
top30agencias <- agencias$Agencia_ID[1:30]
## 销量前30的销售站���每周的销量和退货率图
ggplot(agencias_history %>% filter(Agencia_ID %in% top30agencias))+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~Agencia_ID)+ # 按照销售站划分成子图
scale_y_continuous(labels=function(x)paste(x/1000, "k"))+
scale_fill_gradient(name="退回\n百分比", low="white", high="red")+
ggtitle("销量前30的销售站") + ylab("销量") +xlab("星期")

## 每个州的销售数据的分析
states <- agencias_history %>%
group_by(State, Semana) %>% #数据按照州和星期分组
summarise(Units = sum(Units),
Pesos = sum(Pesos),
Return_Units = sum(Return_Units),
Return_Pesos = sum(Return_Pesos),
Net = sum(Net)) %>%
mutate(Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units)) # 数据按照销量排列
## 地点--星期 --退回百分比 图像可视化
ggplot(states)+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~State)+
scale_y_continuous(labels=function(x)paste(x/1e6, "m"))+
scale_fill_gradient(name="退回\n百分比", low="white", high="red")+
ggtitle("州的销售量")+ ylab("销量") +xlab("星期")

## 销售渠道的分析####
canals <- traindata %>%
group_by(Canal_ID, Semana) %>% #根据销售渠道和星期进行分组
summarise(Units = sum(Venta_uni_hoy),
Pesos = sum(Venta_hoy),
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima),
Net = sum(Demanda_uni_equil)) %>%
mutate(Net_Pesos = Pesos - Return_Pesos,
Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
# 销售渠道1占据主要的销量
treemap(canals, index=c("Canal_ID"), vSize="Units", type="index",
title="Canals repartition")

## 销售渠道的销量和星期和退货率
ggplot(canals)+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~Canal_ID, scale="free")+
scale_y_continuous(labels=function(x)paste(x/1e6, "m"))+
scale_fill_gradient(name="退回\n百分比", low="white", high="red")+
ggtitle("销售渠道")+ ylab("销量") +xlab("星期")

## 销售渠道和销售站分析####
agencias_canals <- traindata %>%
group_by(Agencia_ID) %>%
summarise(n_canals = n_distinct(Canal_ID)) #添加该销售渠道有多少销售站
## 销售渠道有多少销售站可视化
ggplot(agencias_canals)+
geom_histogram(aes(x=n_canals), fill="red", color="black", alpha="0.5", binwidth=0.5)+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(breaks=1:5)+
scale_y_continuous()+
theme(axis.text.x=element_text(hjust=1)) +
labs(x = "销售渠道数量",y = "销售站数量",title = "销售站的销售渠道量")

# 大部分销售站有1条销售渠道,只有很少的销售站有超过三条的销售渠道
## 销售路线的分析####
routes <- traindata %>% group_by(Ruta_SAK) %>%
summarise(n_Agencias = n_distinct(Agencia_ID), #销售路线有多少销售站
n_Clients = n_distinct(Cliente_ID), #销售路线有多少客户
Units=sum(Venta_uni_hoy), #销售路线的销售量
Return_Units = sum(Dev_uni_proxima)) %>% #销售路线的销售量退货量
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>% # 添加退货率变量
arrange(desc(Units)) # 按照销量排序
ggplot(routes, aes(x=Units/7))+
geom_histogram(fill="red", color="gray", binwidth=5000)+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x)paste(x/1000, "k"))+
scale_y_continuous()+
labs(x = "平均每天销量",y = "销售路线数量")

## 大部分的销售路线的销售量并不多,超过2/3的销售路线每天的销售量不超过10千
## 销售路线和销售站#####
routes_agencias <- traindata %>% group_by(Ruta_SAK, Agencia_ID) %>%
summarise(count=n(), #当前分组的观测数
n_Clients = n_distinct(Cliente_ID), # 客户数量
Units=sum(Venta_uni_hoy), #销售量求和
Return_Units = sum(Dev_uni_proxima)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
top100routes <- routes$Ruta_SAK[1:100] # 销量前100的路线
top100agencias <- agencias$Agencia_ID[1:100] # 销量前100的销售站
## 可视化
ggplot(routes_agencias %>%
filter(Ruta_SAK %in% top100routes, Agencia_ID %in% top100agencias))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Agencia_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti") +
scale_color_gradient(name="退回\n百分比", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,4)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
labs(x = "销售路线",y = "销售站",title = "销量前100的销售站&销售路线")

top50routes <- routes$Ruta_SAK[1:50] # 销量前100的路线
top50agencias <- agencias$Agencia_ID[1:50] # 销量前100的销售站
## 可视化
ggplot(routes_agencias %>%
filter(Ruta_SAK %in% top50routes, Agencia_ID %in% top50agencias))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Agencia_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti") +
scale_color_gradient(name="退回\n百分比", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,6)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
labs(x = "销售路线",y = "销售站",title = "销量前50的销售站&销售路线")

## 对客户数据进行分析#####
sales <- traindata %>% #客户数据
group_by(Cliente_ID) %>% # 按照客户id分组
summarise(Units = sum(Venta_uni_hoy),
Pesos = sum(Venta_hoy), # 本周销售金额
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima), # 下星期的退回金额
Net = sum(Demanda_uni_equil)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units),
Avg_Pesos = Pesos / Units) %>% # 单价
mutate(Net_Pesos = Pesos - Return_Pesos) %>% # 实际销售金额
inner_join(cliente_tabla, by="Cliente_ID") %>%
arrange(desc(Pesos)) # 本周销售金额排序
# 花费量前100个客户的树形图
# 可见有一个大客户:Puebla Remision
treemap(sales[1:100, ],
index=c("NombreCliente"), vSize="Units", vColor="Return_Rate",
palette=c("#FFFFFF","#FFFFFF","#FF0000"),
type="value", title.legend="Units return %", title="Top 100 clients")

## 客户的累积消耗量
sales$Cum_Units <- cumsum(sales$Units) / sum(sales$Units) # 累积百分比
s <- seq(1, 800000, 100) # 约有80万个客户
ggplot()+geom_line(aes(x=s, y=sales$Cum_Units[s]))+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x) paste(x/1000, "k"))+
ggtitle("客户分配")+ xlab("客户数量")+ylab("累积百分比(销量)")

## 前20万客户约贡献了75%的销售量
## 客户和销售站分析####
agencias_by_client <- traindata %>%
group_by(Cliente_ID) %>% #按照客户id分组
summarise(n_agencias = n_distinct(Agencia_ID)) %>% #多少个销售站
inner_join(cliente_tabla, by="Cliente_ID")
table(agencias_by_client$n_agencias)
##
## 1 2 3 4 5 9 62
## 844113 37510 3771 19 1 1 1
# 1 2 3 4 5 9 62
# 844113 37510 3771 19 1 1 1
# 大部分的客户只从一个销售站购买,只有几个客户购买狗的销售站 >= 5
agencias_by_client %>% filter(n_agencias %in% c(5, 9, 62)) #返回符合条件的行
## Source: local data table [3 x 3]
##
## Cliente_ID n_agencias NombreCliente
## (int) (int) (chr)
## 1 188391 9 DESAYUNOS ESCOLARES
## 2 653378 62 PUEBLA REMISION
## 3 1274327 5 COMERCIALIZADORA LA PUERTA DEL SOL
# Cliente_ID n_agencias NombreCliente
# (int) (int) (chr)
# 1 188391 9 DESAYUNOS ESCOLARES
# 2 653378 62 PUEBLA REMISION
# 3 1274327 5 COMERCIALIZADORA LA PUERTA DEL SOL
## 客户和购买渠道分析#####
clients_canals <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_canals = n_distinct(Canal_ID))
## 大多数客户只有一个购买渠道。不同的销售渠道可以为一个客户提供服务。
table(clients_canals$n_canals)
##
## 1 2 3 4
## 874022 6516 65 1
# 1 2 3 4
# 874022 6516 65 1
# 很少有销售站有同一个客户通过多个渠道。
clients_agencies_canals <- traindata %>%
group_by(Cliente_ID, Agencia_ID) %>%
summarise(n_canals = n_distinct(Canal_ID))
table(clients_agencies_canals$n_canals)
##
## 1 2 3
## 922108 3271 3
## 客户和路线分析#####
clients_routes <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_routes = n_distinct(Ruta_SAK))
## 大多数客户只有不到5个仓库的交货,但超过240个客户的工作与10个仓库或更多。
sum(clients_routes$n_routes >= 10)
## [1] 242
ggplot(clients_routes)+
geom_histogram(aes(x=n_routes), fill="red", color="black", alpha="0.5", binwidth=1)+
theme_bw(base_family = "STKaiti") +
scale_y_continuous(labels=function(x) paste(x/1000, "k"))+
ggtitle("客户和销售路线")+ xlab("路线数量")+ylab("客户数量")

## 对集团销售的产品进行分析#####
products <- traindata %>% group_by(Producto_ID) %>% #根据生产的产品进行分组
summarise(Units = sum(Venta_uni_hoy), # 销量
Pesos = sum(Venta_hoy), # 卖出的总钱数
Return_Units = sum(Dev_uni_proxima), # 被退回的总量
Return_Pesos = sum(Dev_proxima), #备退回的总钱数
Net = sum(Demanda_uni_equil)) %>% #调整后的我总需求
mutate(Avg_Pesos = Pesos / Units, # 每种产品的单价
Return_Rate = Return_Units / (Units+Return_Units)) %>% # 退货率
filter(!is.nan(Avg_Pesos)) %>% #剔除没有单价的商品
inner_join(producto_tabla, by="Producto_ID") %>%
arrange(desc(Units))
products$NombreProducto <- factor(as.character(products$NombreProducto), levels=products$NombreProducto)
# 销量前100的产品树图
treemap(products[1:100, ],
index=c("NombreProducto"), vSize="Units", vColor="Return_Rate",
palette=c("#FFFFFF","#FFFFFF","#FF0000"),
type="value", title.legend="Units return %", title="Top 100 products")

## 产品的家的密度分布
ggplot(products, aes(x=Avg_Pesos))+
geom_histogram(aes(y=..density..), fill="gray", color="black", alpha="0.8")+
geom_density(fill="red", alpha="0.3")+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(lim=c(0, 50))+
ggtitle("产品单价的分布")+ xlab("平均单价")+ylab("密度")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 74 rows containing non-finite values (stat_bin).
## Warning: Removed 74 rows containing non-finite values (stat_density).

## 产品和销售站
products_agencies <- traindata %>% group_by(Agencia_ID) %>%
summarise(n_products = n_distinct(Producto_ID))
## 大多数销售站会卖100~200种产品
ggplot(products_agencies)+
geom_histogram(aes(x = n_products), fill="red", color="black", alpha="0.5", binwidth=10)+
theme_bw(base_family = "STKaiti") +
ggtitle("销售站出售的产品数量")+ xlab("产品数量(种)")+ylab("销售站数量(个)")

## 产品和销售路线
routes_products <- traindata %>% group_by(Producto_ID) %>%
summarise(n_routes = n_distinct(Ruta_SAK))
## 大部分的产品只有几条销售路线,只有几种产品的销售路线很多
ggplot(routes_products)+
geom_histogram(aes(x=n_routes), fill="red", color="black", alpha="0.5", binwidth=10)+
theme_bw(base_family = "STKaiti") +
ggtitle("产品和销售线路")+ xlab("销售线路(条)")+ylab("产品数量(种)")

## 产品和销售路线 top100
routes.products <- traindata %>% group_by(Ruta_SAK, Producto_ID) %>%
summarise(count=n(),
n_Agencias = n_distinct(Agencia_ID),
n_Clients = n_distinct(Cliente_ID),
Units=sum(Venta_uni_hoy),
Return_Units = sum(Dev_uni_proxima)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
top100routes <- routes$Ruta_SAK[1:100]
top100products <- products$Producto_ID[1:100]
ggplot(routes.products %>%
filter(Ruta_SAK %in% top100routes, Producto_ID %in% top100products))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Producto_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti")+
scale_color_gradient(name="退回\n百分比", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,6)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
ggtitle("销量前100产品&线路")+ xlab("销售线路ID")+ylab("产品ID")

## 产品和销售路线 top50
top50routes <- routes$Ruta_SAK[1:50]
top50products <- products$Producto_ID[1:50]
ggplot(routes.products %>%
filter(Ruta_SAK %in% top50routes, Producto_ID %in% top50products))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Producto_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti")+
scale_color_gradient(name="退回\n百分比", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,8)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
ggtitle("销量前50产品&线路")+ xlab("销售线路ID")+ylab("产品ID")

## 产品和客户
products_by_client <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_products = n_distinct(Producto_ID)) %>%
inner_join(cliente_tabla, by="Cliente_ID")
ggplot(products_by_client)+
geom_histogram(aes(x=n_products), fill="red", color="black", alpha="0.3", binwidth=2)+
theme_bw(base_family = "STKaiti")+
scale_y_continuous(labels=function(x)paste(x/1000, "k"))+
ggtitle("产品量所对应的客户量")+ xlab("产品数量(种)")+ylab("客户数量(位)")
